# =============================================================================
# FIXED EFFECTS MODEL (Model 1) + MORAN'S I (Queen, per year) - 2016-2023
# =============================================================================

import pandas as pd
import numpy as np
from linearmodels.panel import PanelOLS
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import geopandas as gpd
from libpysal.weights import Queen
from esda.moran import Moran
from datetime import datetime

print("=== FIXED EFFECTS MODEL (Model 1) + MORAN'S I (Queen) ===")

# --------------------------------------------------
# 1. Load data
# --------------------------------------------------
df = pd.read_csv("dataset.csv")
print(f"Loaded: {len(df):,} observations (2016-2023)")

df['prov_sec'] = df['province_id'].astype(str) + "_" + df['sector_id']
df['prov_norm'] = df['province_name'].str.lower().str.strip()

# --------------------------------------------------
# 2. Variables
# --------------------------------------------------
exog_vars = ['spe', 'div', 'den', 'ln_fdi', 'ln_productivity_lag', 'skilled_share']

# --------------------------------------------------
# 3. VIF Check
# --------------------------------------------------
print("\nVIF CHECK:")
X = sm.add_constant(df[exog_vars])
for i, var in enumerate(['const'] + exog_vars):
    vif = variance_inflation_factor(X.values, i)
    print(f"{var:20} VIF = {vif:6.2f}")

# --------------------------------------------------
# 4. Fixed Effects Estimation
# --------------------------------------------------
panel_df = df.set_index(['prov_sec', 'year'])

model = PanelOLS(panel_df['ln_wage'], 
                 panel_df[exog_vars],
                 entity_effects=True,
                 time_effects=True,
                 drop_absorbed=True)

results = model.fit(cov_type='clustered', clusters=panel_df['province_id'])

print("\n" + "="*90)
print("FIXED EFFECTS MODEL (Model 1) — 2016-2023")
print("Province×Sector + Year Fixed Effects")
print("="*90)
print(results.summary)

# Clean results table
table = pd.DataFrame({
    'Coefficient': results.params.round(4),
    'Std. Error': results.std_errors.round(4),
    't-stat': results.tstats.round(3),
    'P>|t|': results.pvalues.round(4)
}).loc[exog_vars]

print("\nMAIN RESULTS TABLE:")
print(table)

# --------------------------------------------------
# 5. Moran's I on Province-Averaged Residuals (Queen)
# --------------------------------------------------
print("\n" + "="*80)
print("SPATIAL DIAGNOSTICS: Moran's I (Queen contiguity, province level)")
print("="*80)

gdf = gpd.read_file("province_vn.geojson")
gdf['norm'] = gdf['ten_tinh'].str.lower().str.strip()

common = set(df['prov_norm']) & set(gdf['norm'])
gdf = gdf[gdf['norm'].isin(common)].copy().reset_index(drop=True)

w = Queen.from_dataframe(gdf)
w.transform = 'r'
print(f"Queen weights: {w.n} provinces, avg neighbors = {w.mean_neighbors:.2f}")

# Name-to-index mapping for alignment with weight matrix
name_to_idx = dict(zip(gdf['norm'], range(len(gdf))))

# Add residuals to df
residuals = results.resids.copy()
residuals.index = panel_df.index
df = df.set_index(['prov_sec', 'year'])
df['resid'] = residuals

print("\nMoran's I by year:")
print("-" * 60)
moran_results = []

for year in sorted(df.index.get_level_values('year').unique()):
    year_df = df[df.index.get_level_values('year') == year].reset_index()
    
    # Mean residual per province this year
    prov_resid = year_df.groupby('prov_norm')['resid'].mean()
    
    # Create vector matching the exact order of gdf / w
    resid_vec = np.zeros(w.n)
    for norm_name, res_value in prov_resid.items():
        if norm_name in name_to_idx:
            idx = name_to_idx[norm_name]
            resid_vec[idx] = res_value
    
    mi = Moran(resid_vec, w, permutations=999)
    moran_results.append((year, mi.I, mi.p_sim))
    
    sig = "***" if mi.p_sim < 0.01 else "**" if mi.p_sim < 0.05 else "*" if mi.p_sim < 0.10 else ""
    print(f"Year {year}: Moran's I = {mi.I:.4f}{sig}   p-value = {mi.p_sim:.4f}")

avg_i = np.mean([r[1] for r in moran_results])
avg_p = np.mean([r[2] for r in moran_results])
print(f"\nAverage Moran's I across years = {avg_i:.4f} (avg p = {avg_p:.4f})")

if avg_p < 0.05:
    print("→ Strong evidence of spatial autocorrelation in residuals → SDM recommended")
elif avg_p < 0.10:
    print("→ Moderate spatial autocorrelation detected")

# --------------------------------------------------
# 6. Save results
# --------------------------------------------------
with open("baseline_fe_with_morans_I.txt", "w", encoding="utf-8") as f:
    f.write("FIXED EFFECTS MODEL (Model 1) + MORAN'S I (Queen)\n")
    f.write(f"Date: {datetime.now().strftime('%B %d, %Y at %H:%M')}\n\n")
    f.write(table.to_string(float_format="{:0.4f}".format))
    f.write("\n\nMoran's I by year (Queen contiguity):\n")
    for y, i_val, p_val in moran_results:
        f.write(f"Year {y}: I = {i_val:.4f}, p = {p_val:.4f}\n")
    f.write(f"\nAverage I = {avg_i:.4f} (avg p = {avg_p:.4f})\n")

print("\n✅ DONE! Results saved to: baseline_fe_with_morans_I.txt")